package org.ykx.demo.sql

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql.SQLContext

object DataFrameTest {
  // HDFS paths of the two CSV inputs joined below.
  val df1DirPath = "hdfs://master:8020/tmp/original-data.csv"
  val df2DirPath = "hdfs://master:8020/tmp/df_tem.csv"

  val conf = new SparkConf().setMaster("local[*]").setAppName("DataFrameTest")
  val sc = new SparkContext(conf)
  val sqlContext = new SQLContext(sc)

  /**
   * Demonstrates DataFrame CSV loading and join behavior: loads two CSVs via
   * the spark-csv data source, copies `STATS_TIME` into a `date` column so
   * both frames share a join key, then shows the difference between an
   * explicit-equality join (duplicates the join column) and a USING-style
   * join on the column name (deduplicates it).
   */
  def main(args: Array[String]): Unit = {
    // Brings implicit RDD -> DataFrame conversions (e.g. .toDF()) into scope.
    import sqlContext.implicits._
//    val lines1 = sc.textFile(df1DirPath, 2)
//    val df1 = lines1.map(_.split(",")).filter { x => ??? }map { x => (x.apply(0),x.apply(1),x.apply(2)) }.toDF()
//    df1.show()

    // sqlContext.load(source, options) is deprecated since Spark 1.4;
    // use the DataFrameReader API instead.
    val df1 = sqlContext.read
      .format("com.databricks.spark.csv")
      .options(Map("path" -> df1DirPath, "header" -> "true"))
      .load()
//    df1.withColumnRenamed("STATS_TIME", "date").show()
    // Add a "date" column (copy of STATS_TIME) to align the join key with df2.
    val df = df1.withColumn("date", df1.col("STATS_TIME"))
    df.show()

    val df2 = sqlContext.read
      .format("com.databricks.spark.csv")
      .options(Map("path" -> df2DirPath, "header" -> "true"))
      .load()
    df2.show()

    // Explicit column-equality join: the result keeps BOTH "date" columns
    // (duplicate-column problem).
    df.join(df2, df.col("date") === df2.col("date")).show()

    // USING-style join on the column name: the shared "date" column appears
    // only once in the output.
    df.join(df2, "date").show()

//    println("Cartesian product: " + df.join(df2).count())

    // Stop the SparkContext so the application shuts down cleanly.
    sc.stop()
  }
}